Jupyter Notebook of the Iris Flower Dataset
Notebook is by Solomon Sonya 0xSolomonSonya
Some code and data cells in this notebook may have been augmented from ChatGPT, Copilot, Gemini, other Generative AI models, and online resources.

We will model our data in Phases:¶

  • Phase 1: Prepare and Clean the Dataset.
  • Phase 2: Explore the Dataset
  • Phase 3: Modeling & Classification
  • Phase 4: Model Evaluation
  • Phase 5: Model Selection & Deployment

Data Wrangling:¶

  • Augmentation (add new row or column)
  • Subsetting (Filter based on condition)
  • Cleaning (drop_na, fill_na, imputation, outliers)
  • Aggregating (Groupby.mean())
  • Transforming (Scale, Standardize, Normalize)

Phase 1: Prepare and Clean the Data

imports¶

In [1]:
from sklearn.datasets import load_iris
from sklearn import datasets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as stat
import scipy.stats as st
import scipy as sp
import os
#import scikitplot as skplt
import datetime
from tabulate import tabulate

import sklearn
import plotly.graph_objs as go
import ipywidgets as widgets
import math
import statsmodels
import warnings
import io
import inspect
import sys
import traceback

from scipy.stats.mstats import winsorize
from IPython.display import display, HTML
from sklearn.preprocessing import RobustScaler

#scale the data via z-score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

#preprocesing
#learning and prediction algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from statsmodels.graphics.gofplots import ProbPlot
from statsmodels.formula.api import ols
import statsmodels.api as sm
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import Lasso, LassoCV, RidgeCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder  # Optional for categorical labels
from sklearn.metrics import classification_report
from sklearn.inspection import permutation_importance
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeClassifier

# models
from sklearn import ensemble
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import tree
from sklearn import gaussian_process
from sklearn import neighbors
from sklearn import svm
from xgboost import XGBRegressor
from sklearn import discriminant_analysis
from sklearn import neural_network
from sklearn import calibration

#from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier


# save and import trained models
import pickle

# Deep Learning
#import tensorflow as tf
#from tensorflow import keras
#from sklearn.datasets import make_classification

# PCA
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

#importing  [Bagging]
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor

#importing  [Boosting]
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import xgboost
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis

# model tuning
from sklearn.model_selection import GridSearchCV

# evaluation metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, r2_score
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve
from sklearn.metrics import make_scorer,mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_absolute_percentage_error, rand_score
from sklearn.metrics import jaccard_score, dcg_score, consensus_score, d2_absolute_error_score
from sklearn.metrics import d2_pinball_score, d2_tweedie_score, davies_bouldin_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import auc, precision_recall_curve
from sklearn.metrics import cohen_kappa_score
from kneed import KneeLocator
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score



import numpy as np 
from sklearn.linear_model import Perceptron
from sklearn.model_selection import KFold




#update system
#!pip install --upgrade scikit-learn
#!pip install xgboost
#model tuning

%matplotlib inline
#to ignore warnings
warnings.filterwarnings("ignore")
print("imports complete.")
imports complete.
In [2]:
#pip install kneed

image.png
image source: https://peaceadegbite1.medium.com/iris-flower-classification-60790e9718a1, Iris Flower Classification, Peace Ikeoluwa Adegbite, Retrieved: 2025-02-04

load dataset¶

In [3]:
iris = load_iris()

convert data to dataframe:¶

2-dimensional datastructure used for storing tabular data (rows and columns). Each column is a specific attribute (or feature), each row is an entire instance (aka object, sample, data point, observation)¶

In [4]:
df_iris = pd.DataFrame(data=iris.data, columns=iris.feature_names)

view dataframe¶

In [5]:
df_iris
Out[5]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
... ... ... ... ...
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8

150 rows × 4 columns

count number of values that are negative¶

In [6]:
negative_mask = df_iris < 0

total_negative_values = negative_mask.sum().sum()

print("Total number of negative values:", total_negative_values)
Total number of negative values: 0

view columns¶

In [7]:
print(list(df_iris.columns))
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

rename columns, eliminate whitespaces¶

In [8]:
            #   old name: new name
col_rename = {'sepal length (cm)': 'sepal_length',
              'sepal width (cm)' : 'sepal_width',
              'petal length (cm)': 'petal_length',
              'petal width (cm)' : 'petal_width' }

# df_iris = df_iris.rename(columns=col_rename)
# or
df_iris.rename(columns=col_rename, inplace=True)

view first 6 instances in dataframe¶

In [9]:
df_iris.head(6)
Out[9]:
sepal_length sepal_width petal_length petal_width
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
5 5.4 3.9 1.7 0.4

augment classLabel to the dataframe¶

In [10]:
df_iris['class_label'] = iris.target

sample 4 random instances (without replacement)¶

In [11]:
df_iris.sample(4)
Out[11]:
sepal_length sepal_width petal_length petal_width class_label
81 5.5 2.4 3.7 1.0 1
8 4.4 2.9 1.4 0.2 0
128 6.4 2.8 5.6 2.1 2
97 6.2 2.9 4.3 1.3 1

create mapping & augment with flower category name¶

In [12]:
map_col_name = { 0: 'Iris-setosa',
                 1: 'Iris-versicolor',
                 2: 'Iris-virginica'}

df_iris['category'] = df_iris['class_label'].apply(lambda x: map_col_name[x])

sample 10 instances (with replacement)¶

In [13]:
df_iris.sample(10, replace=True)
Out[13]:
sepal_length sepal_width petal_length petal_width class_label category
114 5.8 2.8 5.1 2.4 2 Iris-virginica
79 5.7 2.6 3.5 1.0 1 Iris-versicolor
125 7.2 3.2 6.0 1.8 2 Iris-virginica
103 6.3 2.9 5.6 1.8 2 Iris-virginica
84 5.4 3.0 4.5 1.5 1 Iris-versicolor
103 6.3 2.9 5.6 1.8 2 Iris-virginica
133 6.3 2.8 5.1 1.5 2 Iris-virginica
96 5.7 2.9 4.2 1.3 1 Iris-versicolor
65 6.7 3.1 4.4 1.4 1 Iris-versicolor
70 5.9 3.2 4.8 1.8 1 Iris-versicolor

save the full dataset¶

In [14]:
# make directory
os.makedirs('./data', exist_ok=True)

# save data
df_iris.to_csv('./data/iris.csv', sep=',', index=False)

Phase 2: Explore the dataset

review the dataset¶

In [15]:
df_iris.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class_label   150 non-null    int64  
 5   category      150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
In [16]:
# ensure index is within range of expected number of instances
df_iris.index
Out[16]:
RangeIndex(start=0, stop=150, step=1)
In [17]:
# view shape in format (rows, cols) ==> rows == # of instances, cols == # of features
df_iris.shape
Out[17]:
(150, 6)
In [18]:
# aggregaion function to summarize number values within the dataset
# we use .T for transpose (i.e., swap rows with cols)
df_iris.describe().T
Out[18]:
count mean std min 25% 50% 75% max
sepal_length 150.0 5.843333 0.828066 4.3 5.1 5.80 6.4 7.9
sepal_width 150.0 3.057333 0.435866 2.0 2.8 3.00 3.3 4.4
petal_length 150.0 3.758000 1.765298 1.0 1.6 4.35 5.1 6.9
petal_width 150.0 1.199333 0.762238 0.1 0.3 1.30 1.8 2.5
class_label 150.0 1.000000 0.819232 0.0 0.0 1.00 2.0 2.0

'pretty-print' of dataframe¶

In [19]:
# this can be a very useful function to print contents of dataframe when the default printing scheme doesn't display the df in an easy to read manner
def print_df(dataframe):
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(tabulate(dataframe, headers='keys', tablefmt='pretty'))

print_df(df_iris.describe().T)
+--------------+-------+--------------------+--------------------+-----+-----+------+-----+-----+
|              | count |        mean        |        std         | min | 25% | 50%  | 75% | max |
+--------------+-------+--------------------+--------------------+-----+-----+------+-----+-----+
| sepal_length | 150.0 | 5.843333333333334  | 0.8280661279778629 | 4.3 | 5.1 | 5.8  | 6.4 | 7.9 |
| sepal_width  | 150.0 | 3.0573333333333337 | 0.435866284936698  | 2.0 | 2.8 | 3.0  | 3.3 | 4.4 |
| petal_length | 150.0 | 3.7580000000000005 | 1.7652982332594667 | 1.0 | 1.6 | 4.35 | 5.1 | 6.9 |
| petal_width  | 150.0 | 1.1993333333333336 | 0.7622376689603465 | 0.1 | 0.3 | 1.3  | 1.8 | 2.5 |
| class_label  | 150.0 |        1.0         | 0.8192319205190405 | 0.0 | 0.0 | 1.0  | 2.0 | 2.0 |
+--------------+-------+--------------------+--------------------+-----+-----+------+-----+-----+

view unique categories¶

In [20]:
print(df_iris['category'].unique())
['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']

view how many instances exist for each species category¶

In [21]:
df_iris['category'].value_counts()
Out[21]:
category
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

count number of empty/null instances¶

In [22]:
df_iris.isnull().sum()
Out[22]:
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
class_label     0
category        0
dtype: int64

filter df for specific species (using dot notation)¶

In [23]:
df_iris[df_iris['category'] == 'Iris-versicolor'].tail(5)
Out[23]:
sepal_length sepal_width petal_length petal_width class_label category
95 5.7 3.0 4.2 1.2 1 Iris-versicolor
96 5.7 2.9 4.2 1.3 1 Iris-versicolor
97 6.2 2.9 4.3 1.3 1 Iris-versicolor
98 5.1 2.5 3.0 1.1 1 Iris-versicolor
99 5.7 2.8 4.1 1.3 1 Iris-versicolor
In [24]:
# df_pairplot = df_iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'category']] # we'll do this later :-)

alternatively filter df for specific species (using dot notation)¶

In [25]:
df_iris[df_iris.category == 'Iris-versicolor'].tail(5)
Out[25]:
sepal_length sepal_width petal_length petal_width class_label category
95 5.7 3.0 4.2 1.2 1 Iris-versicolor
96 5.7 2.9 4.2 1.3 1 Iris-versicolor
97 6.2 2.9 4.3 1.3 1 Iris-versicolor
98 5.1 2.5 3.0 1.1 1 Iris-versicolor
99 5.7 2.8 4.1 1.3 1 Iris-versicolor
In [26]:
df_iris
Out[26]:
sepal_length sepal_width petal_length petal_width class_label category
0 5.1 3.5 1.4 0.2 0 Iris-setosa
1 4.9 3.0 1.4 0.2 0 Iris-setosa
2 4.7 3.2 1.3 0.2 0 Iris-setosa
3 4.6 3.1 1.5 0.2 0 Iris-setosa
4 5.0 3.6 1.4 0.2 0 Iris-setosa
... ... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 2 Iris-virginica
146 6.3 2.5 5.0 1.9 2 Iris-virginica
147 6.5 3.0 5.2 2.0 2 Iris-virginica
148 6.2 3.4 5.4 2.3 2 Iris-virginica
149 5.9 3.0 5.1 1.8 2 Iris-virginica

150 rows × 6 columns


VISUALIZE THE DATA

univariate analysis

boxplots¶

In [27]:
# single attribute - boxplot vertical is the default
df_iris['sepal_length'].plot(kind="box");
No description has been provided for this image
In [28]:
# display boxplot horizontally
df_iris['sepal_length'].plot(kind="box", vert=False);
No description has been provided for this image

adjust image size¶

In [29]:
fig, ax = plt.subplots(figsize=(10, 4))  # Width = 10 inches, Height = 4 inches

# Create the horizontal box plot on the specified axes
df_iris['sepal_length'].plot(kind="box", vert=False, ax=ax)

# Optional: Add a title
ax.set_title('Sepal Length Distribution')

# Optional: Customize x-axis label (since it's horizontal)
ax.set_xlabel('Sepal Length (cm)')

plt.show()
No description has been provided for this image
In [30]:
# combine single boxplots
df_iris.drop('class_label', axis=1).plot(kind="box");
No description has been provided for this image
In [31]:
#df_iris.boxplot()
In [32]:
df_iris.boxplot(column='sepal_length', by='category');
No description has been provided for this image
In [33]:
df_iris.drop('class_label', axis=1).boxplot(by='category', figsize=(10,15), sharex=False, sharey=False);
plt.subplots_adjust(hspace=0.25)
No description has been provided for this image

Observation Petal Length and Petal Width may be better discriminators as we can start to see where decision boundaries could be

In [34]:
#sns.boxplot(data=df_iris, x='category', y='sepal_length')

kde¶

In [35]:
df_iris.drop('class_label', axis=1).plot(kind='kde');

# more plots to try later
# df_iris.plot(kind=?)
#line, bar, barh, hist, box, kde, density, area, pie, scatter, hexbin, etc
No description has been provided for this image
In [36]:
# specify list of features (aka attributes)
lst_attributes = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

sns.kdeplot(data=df_iris[lst_attributes], fill=True);
No description has been provided for this image

histogram¶

In [37]:
df_iris['sepal_length'].hist(edgecolor='black');
No description has been provided for this image
In [38]:
# adjust figure
df_iris['sepal_length'].hist(alpha=0.7, rwidth=0.93, edgecolor='black');
No description has been provided for this image
In [39]:
# kde plots an estimate of the probability density funciton of a continuous random variable - we can use this to visualize the desnsity
sns.kdeplot(df_iris['sepal_length'], fill=True);
No description has been provided for this image

plot hist and smoothed kde¶

In [40]:
df_iris['sepal_length'].hist(alpha=0.7, rwidth=0.9, edgecolor='black', density=True);
sns.kdeplot(df_iris['sepal_length'], color='red', fill=False);
No description has been provided for this image

add median and mean¶

if mean > median: i.e., mean is to the RIGHT of median --> positive skew == right-skewed
if mean < median: i.e., mean is to the LEFT of median --> negative skew == left-skewed
if Mean ≈ Median: i.e., mean closely equal to median --> data is nearly symmetric == normal distribution

In [41]:
mean_sepal_len = df_iris['sepal_length'].mean()
median_sepal_len = df_iris['sepal_length'].median()

# Create a histogram and KDE plot
fig, ax = plt.subplots()

ax.hist(df_iris['sepal_length'], bins=10, alpha=0.7, rwidth=0.9, edgecolor='black', density=True)
sns.kdeplot(df_iris['sepal_length'], color='red', fill=False);

# Add mean and median line
ax.axvline(mean_sepal_len, color='orange', linestyle='--', label=f'Mean: {mean_sepal_len:.2f}')
ax.axvline(median_sepal_len, color='green', linestyle='-', label=f'Median: {median_sepal_len:.2f}')

ax.legend()

plt.show();
No description has been provided for this image

iterate through all attributes and plot histogram on subplots¶

In [42]:
# create 2X2 grid for subplots
#fig, axes = plt.subplots(2, 2, figsize=(10, 8), sharex=True, sharey=True)
fig, axes = plt.subplots(2, 2, figsize=(10, 8)) # uncomment above to share same x and y scale

# flatten subplot for iteration
axes = axes.flatten()

# specify list of features (aka attributes)
#lst_attributes = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# plot histogram in each subplot
for i, attr in enumerate(lst_attributes):
    axes[i].hist(df_iris[attr], bins=10, edgecolor='black', alpha=0.7)
    axes[i].set_title(f'Histogram of {attr}')
    axes[i].set_xlabel(attr)
    axes[i].set_ylabel('Frequency')
    axes[i].tick_params(axis='both', which='both', labelsize=10)

plt.tight_layout()
plt.show()
No description has been provided for this image

add kde, mean, and median¶

In [43]:
# create 2X2 grid for subplots

# uncomment out below if you wish to share x and y data points
#fig, axes = plt.subplots(2, 2, figsize=(15, 8), sharex=True, sharey=True)
fig, axes = plt.subplots(2, 2, figsize=(10, 8), sharex=False, sharey=False)

# flatten subplot for iteration
axes = axes.flatten()

# specify list of features (aka attributes)
lst_attributes = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# plot histogram in each subplot
for i, attr in enumerate(lst_attributes):
    axes[i].hist(df_iris[attr], bins=10, edgecolor='black', alpha=0.7, density=True)
    sns.kdeplot(df_iris[attr], ax=axes[i], color='red', linewidth=2, alpha=0.8)
    axes[i].set_title(f'Histogram of {attr}')
    axes[i].set_xlabel(attr)
    axes[i].set_ylabel('Density')
    axes[i].tick_params(axis='both', which='both', labelsize=10)

    # add mean and median line
    mean_attr = df_iris[attr].mean()
    median_attr = df_iris[attr].median()

    axes[i].axvline(mean_attr, color='orange', linestyle='--', label=f'Mean: {mean_attr:.2f}')
    axes[i].axvline(median_attr, color='green', linestyle='-', label=f'Median: {median_attr:.2f}')

    axes[i].legend()


plt.tight_layout()
plt.show()
No description has been provided for this image
In [44]:
# create 2X2 grid for subplots
fig, axes = plt.subplots(2, 2, figsize=(10, 8), sharex=False, sharey=False)

# flatten subplot for iteration
axes = axes.flatten()

# specify list of features (aka attributes)
lst_attributes = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# plot histogram in each subplot
for i, attr in enumerate(lst_attributes):
    sns.histplot(df_iris[attr], ax=axes[i], kde=True) #kde=True adds the line    
    #or
    #sns.histplot(df_iris[attr], ax=axes[i], color="skyblue")  # Histogram color
    #sns.kdeplot(df_iris[attr], ax=axes[i], color="red")  # KDE line color

    axes[i].set_title(f'Histogram of {attr}')
    axes[i].set_xlabel(attr)
    axes[i].set_ylabel('Frequency')
    axes[i].tick_params(axis='both', which='both', labelsize=10)

    # add mean and median line
    mean_attr = df_iris[attr].mean()
    median_attr = df_iris[attr].median()

    axes[i].axvline(mean_attr, color='orange', linestyle='--', label=f'Mean: {mean_attr:.2f}')
    axes[i].axvline(median_attr, color='green', linestyle='-', label=f'Median: {median_attr:.2f}')

    axes[i].legend()


plt.tight_layout()
plt.show()
No description has been provided for this image

bivariate analysis

In [45]:
df_iris.plot(kind="scatter", x='sepal_length', y='sepal_width');
No description has been provided for this image
In [46]:
# show univariate histogram and bivariate scatterplots in the same plot
sns.jointplot(data=df_iris, x='sepal_length', y='sepal_width')
Out[46]:
<seaborn.axisgrid.JointGrid at 0x7f675ddd3690>
No description has been provided for this image

add hue to better distinguish classes¶

In [47]:
sns.scatterplot(data=df_iris, x='sepal_length', y='sepal_width', hue='category');
No description has been provided for this image
In [48]:
sns.jointplot(data=df_iris, x='sepal_length', y='sepal_width', hue='category');
No description has been provided for this image
In [49]:
# linear model plot --> scatter plot with a regression line
# shaded region represents the 95% confidence interval (which is directly related to standard error of measurement)
sns.lmplot(data=df_iris, x='sepal_length', y='sepal_width', hue='category');
No description has been provided for this image

let's use an aggregation function to look at the correlation of features¶

In [50]:
correlation_matrix = df_iris.corr(numeric_only=True)
correlation_matrix
Out[50]:
sepal_length sepal_width petal_length petal_width class_label
sepal_length 1.000000 -0.117570 0.871754 0.817941 0.782561
sepal_width -0.117570 1.000000 -0.428440 -0.366126 -0.426658
petal_length 0.871754 -0.428440 1.000000 0.962865 0.949035
petal_width 0.817941 -0.366126 0.962865 1.000000 0.956547
class_label 0.782561 -0.426658 0.949035 0.956547 1.000000

visualize correlation via heatmap¶

In [51]:
plt.figure(figsize=(8, 6))  # Adjust width and height as needed
sns.heatmap(correlation_matrix, annot=True, cmap='PuBu');
No description has been provided for this image
In [52]:
# bivariate analysis on pair of features with a strong correlation
sns.scatterplot(data=df_iris, x='petal_length', y='petal_width', hue='category', edgecolor='gray');
No description has been provided for this image

shows relationship between sepal_length numerical value is distributed across different species categories¶

In [53]:
# unlike scatter plots, swarmplot does not let the same values overlap to help visualize the distribution
# this chart helps us visualize datapoints of greatest concentration
sns.boxplot(data=df_iris, x='category', y='sepal_length');
sns.swarmplot(data=df_iris, x='category', y='sepal_length');
No description has been provided for this image

View centrality of the data¶

In [54]:
# Create a 2x2 subplot layout
fig, axes = plt.subplots(2, 2, figsize=(14, 14))
axes = axes.flatten()

# Plot boxplot and swarmplot for each feature
for i, feature in enumerate(lst_attributes):
    sns.boxplot(ax=axes[i], data=df_iris, x='category', y=feature)
    sns.swarmplot(ax=axes[i], data=df_iris, x='category', y=feature)
    axes[i].set_title(f'{feature.capitalize()} by Category')
    axes[i].set_xlabel('Category')
    axes[i].set_ylabel(feature)


plt.subplots_adjust(wspace=0.3, hspace=0.4)
#plt.tight_layout()
plt.show()
No description has been provided for this image
In [55]:
#fig, axes = plt.subplots(2, 2, figsize=(14, 14))
plt.figure(figsize=(14,14))

plt.subplot(2,2,1)
sns.violinplot(x='category',y='sepal_length',data=df_iris)
sns.swarmplot(x='category', y='sepal_length', data=df_iris)

plt.subplot(2,2,2)
sns.violinplot(x='category',y='sepal_width',data=df_iris)
sns.swarmplot(x='category', y='sepal_width', data=df_iris)

plt.subplot(2,2,3)
sns.violinplot(x='category',y='petal_length',data=df_iris)
sns.swarmplot(x='category', y='petal_length', data=df_iris)

plt.subplot(2,2,4)
sns.violinplot(x='category',y='petal_width',data=df_iris);
sns.swarmplot(x='category', y='petal_width', data=df_iris);
No description has been provided for this image

Filter dataframe by class and Plot historgram for each class¶

In [56]:
# Get the unique classes
unique_classes = df_iris['category'].unique()

lst_drop_features = ['category', 'class_label']

    
# Set up the figure with subplots
plt.figure(figsize=(16, 20))  # Adjusted size to fit all subplots

for i in range(len(unique_classes)):
    # Get label for the current class
    label = unique_classes[i]
    
    # Filter the data for the current class
    df_label = df_iris[df_iris['category'] == label].drop(lst_drop_features, axis=1)

    # Calculate the correlation matrix
    correlation_matrix = df_label.corr()

    # Plot the heatmap
    plt.subplot(len(unique_classes), 2, 2 * i + 1)
    sns.heatmap(correlation_matrix, annot=True, cmap='PuBu', cbar=True)
    plt.title(f'Correlation Matrix for Category {label}')

    # Plot the boxplot with swarmplot overlay
    plt.subplot(len(unique_classes), 2, 2 * i + 2)
    sns.boxplot(data=df_label, ax=plt.gca(), palette="Blues")
    for col in df_label.columns:
        sns.swarmplot(x=[col]*len(df_label), y=df_label[col], ax=plt.gca(), edgecolor='black', color='red')
    plt.title(f'Boxplots with Swarm for Category {label}')
    

# Adjust layout to add space between rows
plt.subplots_adjust(hspace=0.2) 

#plt.tight_layout()
plt.show()
No description has been provided for this image

bivariate analysis via pairplot¶

In [57]:
#<br> !!! Beware!!! too many features may end up taking significant time to process. 
# filter specific features
df_pairplot = df_iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'category']]
In [58]:
%%time
sns.pairplot(df_pairplot, diag_kind='kde', hue='category');
CPU times: user 2.47 s, sys: 0 ns, total: 2.47 s
Wall time: 3.3 s
No description has been provided for this image

Bivariate analysis with histogram across the diagonal¶

In [59]:
%%time
sns.pairplot(df_pairplot, diag_kind='hist', hue='category');
CPU times: user 2.68 s, sys: 11.4 ms, total: 2.69 s
Wall time: 3.55 s
No description has been provided for this image

View the Mean and Standard Deviaiton of each feature per class¶

In [60]:
# Calculate the mean and standard deviation for each feature grouped by category
mean_values = df_pairplot.groupby('category').mean()
std_values = df_pairplot.groupby('category').std()

# Set up the plot
plt.figure(figsize=(12, 8))

# Plotting the mean and standard deviation for each feature across the x-axis
for i, category in enumerate(mean_values.index):
    plt.errorbar(mean_values.columns, mean_values.iloc[i], yerr=std_values.iloc[i], 
                 label=f'{category}', marker='o', capsize=5)

# Customizing the plot
plt.xlabel('Features')
plt.ylabel('Value')
plt.title('Mean and Standard Deviation for Each Category')
plt.legend(title='Species')
plt.grid(True)
plt.show()
No description has been provided for this image

Plot same data, fill in standard deviation¶

In [61]:
# Calculate the mean and standard deviation for each feature grouped by category
mean_values = df_pairplot.groupby('category').mean()
std_values = df_pairplot.groupby('category').std()

# Set up the plot
plt.figure(figsize=(12, 8))

# Plotting the mean and filling the standard deviation area for each category
for i, category in enumerate(mean_values.index):
    plt.plot(mean_values.columns, mean_values.iloc[i], marker='o', label=f'{category}')
    plt.fill_between(mean_values.columns,
                     mean_values.iloc[i] - std_values.iloc[i],
                     mean_values.iloc[i] + std_values.iloc[i],
                     alpha=0.2)  # Alpha controls the transparency of the fill

# Customizing the plot
plt.xlabel('Features')
plt.ylabel('Value')
plt.title('Mean and Standard Deviation for Each Category')
plt.legend(title='Species')
plt.grid(True)
plt.show()
No description has been provided for this image

Combine both plots to toggle when to show error bar range¶

In [62]:
plot_error_bar = True
plot_std_error = False

# Calculate the mean and standard deviation for each feature grouped by category
mean_values = df_pairplot.groupby('category').mean()
std_values = df_pairplot.groupby('category').std()
std_err_values = std_values / np.sqrt(df_pairplot.groupby('category').count())

# determine which bar to plot
lst_error_bar = std_values

if plot_std_error:
    lst_error_bar = std_err_values

# Set up the plot
plt.figure(figsize=(12, 8))

# Plotting the mean and filling the standard deviation area for each category
for i, category in enumerate(mean_values.index):
    if plot_error_bar:
        plt.errorbar(mean_values.columns, mean_values.iloc[i], yerr=lst_error_bar.iloc[i], 
                 label=f'{category}', marker='o', capsize=5)
    else:
        plt.plot(mean_values.columns, mean_values.iloc[i], marker='o', label=f'{category}')
    plt.fill_between(mean_values.columns,
                     mean_values.iloc[i] - lst_error_bar.iloc[i],
                     mean_values.iloc[i] + lst_error_bar.iloc[i],
                     alpha=0.2)  # Alpha controls the transparency of the fill

# Customizing the plot
plt.xlabel('Features')
plt.ylabel('Value')
plt.title('Mean and Standard Deviation for Each Category')
plt.legend(title='Species')
plt.grid(True)
plt.show()
No description has been provided for this image

much more time is spent in EDA. You can view more chart ideas:¶

  • https://seaborn.pydata.org/examples/index.html
  • https://matplotlib.org/stable/gallery/index.html

Helper Functions

Display Confusion Matrix¶

In [63]:
def display_confusion_matrix(model_NAME, conf_mtx, lst_class_labels):
    os.makedirs('./confusion_matrix', exist_ok=True)        
            
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_mtx, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=lst_unique_class_names, yticklabels=lst_unique_class_names)
    plt.title(model_NAME + ' Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
        
    # Save the figure
    plt.savefig('confusion_matrix/confusion_matrix_' + str(model_NAME) + '.png', dpi=300, bbox_inches='tight')
    
    # Show the plot
    plt.show()

Model Learning Curve¶

In [64]:
def model_learning_curve(model_name, X, y, num_trials, test_size_override, alpha, show_learning_curve):
    # set lerning curves to compute at varying amounts of training data
    #lst_training_size = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80]
    lst_training_proportion = [0.05, 0.10, 0.20, 0.40, 0.60, 0.80]

    # notify
    print(f'commencing evaluation of model [{model_name}]')
    
    # set test size
    testSize = test_size_override

    if testSize < .05 or testSize > .99:
        testSize = 0.1
    
    # set num trials
    numTrials = num_trials

    if numTrials < 1:
        numTrials = 1
    
    # create list to store eval scores ffor training and test sets
    lst_eval_scores_TRAINING_SET = []
    lst_std_dev_TRAINING_SET = []
    lst_std_err_TRAINING_SET = []
    
    
    lst_eval_scores_TEST_SET = []
    lst_std_dev_TEST_SET = []
    lst_std_err_TEST_SET = []
    

    continue_modeling = True

        
    # outter loop: training size
    for i in range(len(lst_training_proportion)):
        # set training size percentage   
        trainSize = lst_training_proportion[i]
        
        # init eval scores at this percentage (index)
        total_eval_score_TRAINING = 0
        total_eval_score_TEST = 0

        lst_training_scores_for_this_proportion = []
        lst_test_scores_for_this_proportion = []

        if not continue_modeling:
            break
    
        # iterate num trails, store average eval score
        for j in range(numTrials):
            
            # establish training/testing hold-out
            X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=testSize,train_size=trainSize,shuffle=True) 
    
            # initialize model
            
            ################################################################################
            # General Linear Models
            ################################################################################
            if model_name == 'logistic_regression':
                mdl = LogisticRegression() # use default parameters            
            elif model_name == 'perceptron':
                mdl = linear_model.Perceptron()
            elif model_name == 'sgd_classifier':
                mdl = SGDClassifier()
            elif model_name == 'passive_aggressive':
                mdl = linear_model.PassiveAggressiveClassifier()
            elif model_name == 'ridge_classifier':
                mdl = RidgeClassifier()

            ################################################################################
            # SVM
            ################################################################################
            elif model_name == 'linear_svc':
                mdl = svm.LinearSVC()
            elif model_name == 'svc':
                mdl = svm.SVC(probability=True, kernel='linear')
            #elif model_name == 'svc_nu':
            #    mdl = svm.NuSVC(probability=True, kernel='linear')

            # tree
            elif model_name == 'decision_tree':
                mdl = DecisionTreeClassifier()

            ################################################################################
            # ensemble
            ################################################################################
            elif model_name == 'random_forest':
                mdl = RandomForestClassifier() # random_state=42
            elif model_name == 'extra_tree':
                mdl = ExtraTreesClassifier()
            elif model_name == 'bagging_classifier':
                mdl = BaggingClassifier()
            elif model_name == 'gradient_boosting':
                mdl = ensemble.GradientBoostingClassifier() # random_state=42
            elif model_name == 'ada_boosting':
                mdl = ensemble.AdaBoostClassifier()
            elif model_name == 'hist_boosting':
                mdl = HistGradientBoostingClassifier()

            ################################################################################
            # Gaussian Processes
            ################################################################################
            elif model_name == 'gaussian_process':
                mdl = gaussian_process.GaussianProcessClassifier()

            ################################################################################
            # NBC probabilistic
            ################################################################################
            elif model_name == 'nbc_gaussian':
                mdl = GaussianNB()
            elif model_name == 'nbc_bernoulli':
                mdl = naive_bayes.BernoulliNB()

            ################################################################################
            # neighbors
            ################################################################################
            elif model_name == 'knn':
                mdl = KNeighborsClassifier()

            ################################################################################
            # discriminant analysis
            ################################################################################
            #elif model_name == 'quadratic_discriminant':
            #    mdl = QuadraticDiscriminantAnalysis()
            elif model_name == 'linear_discriminant':
                mdl = LinearDiscriminantAnalysis()

            ################################################################################
            # neural network
            ################################################################################
            elif model_name == 'mlp':
                mdl = MLPClassifier(hidden_layer_sizes=(100), max_iter=500)  

            ################################################################################
            # ELSE
            ################################################################################
            else:
                print(f'\n\n ERROR! I am not configured to test model {model_name}\n\n')
                continue_modeling = False
                break
    
                                    
            
            # validate modeling
            if not continue_modeling:
                continue
            
            ################################################################################
            # train the model to learn the training set
            ################################################################################
            mdl.fit(X_train, y_train) 
    
            # use score helper function to apply trained model to the test set and return accuracy. then accumulate model eval performance
            eval_score_TEST = mdl.score(X_test,y_test)

            # store specific score
            lst_test_scores_for_this_proportion.append(eval_score_TEST)

            # accumulate total for the mean
            total_eval_score_TEST = total_eval_score_TEST + eval_score_TEST
            
            eval_score_TRAINING = mdl.score(X_train,y_train)

            # store specific score for this num trial
            lst_training_scores_for_this_proportion.append(eval_score_TRAINING)

            # accumulate total for the mean
            total_eval_score_TRAINING = total_eval_score_TRAINING + eval_score_TRAINING

        #############################################################################################
        # done with num trials (inner loop) at this training size, store average evaluation score
        ############################################################################################
        if continue_modeling:
            lst_eval_scores_TEST_SET.append(total_eval_score_TEST/numTrials)
            lst_eval_scores_TRAINING_SET.append(total_eval_score_TRAINING/numTrials)

            # calculate std dev
            std_dev_training = np.std(lst_training_scores_for_this_proportion, ddof=1)
            std_dev_test = np.std(lst_test_scores_for_this_proportion, ddof=1)
            
            # store std dev
            lst_std_dev_TRAINING_SET.append(std_dev_training)
            lst_std_dev_TEST_SET.append(std_dev_test)

            # std err
            lst_std_err_TRAINING_SET.append(std_dev_training / np.sqrt(len(lst_std_dev_TRAINING_SET))) # len(lst_std_dev_TRAINING_SET) should == numtrials
            lst_std_err_TEST_SET.append(std_dev_test / np.sqrt(len(lst_std_dev_TEST_SET))) # len(lst_std_dev_TEST_SET) should == numtrials
                                            

            

    ###############################################################################################
    # done with outter loop
    ###############################################################################################
    if continue_modeling:
               

        # calculate score ratio as a way of quantitatively determine overfitting 
        lst_score_ratio = np.array(lst_eval_scores_TRAINING_SET) / np.array(lst_eval_scores_TEST_SET)
        lst_score_ratio

        # we use paired t-test becauses the underlying data is similar
        t_statistic, p_value = st.ttest_rel(lst_eval_scores_TRAINING_SET, lst_eval_scores_TEST_SET)

        disposition = 'fail to reject null hypothesis; observed difference between 2 samples are not statistically significant'
        if p_value < alpha:
            disposition = ' reject null hypothesis; observed difference between 2 samples are statistically significant'


        # plot learning curves
        if show_learning_curve:
            plt.figure(figsize=(10, 6))
            plt.plot(lst_training_proportion,lst_eval_scores_TEST_SET, label='Test Set', marker='o')
            plt.plot(lst_training_proportion,lst_eval_scores_TRAINING_SET, label='Training Set', marker='x')
            plt.xlabel('Training Size')
            plt.ylabel('Evaluation Score')
            plt.title('Learning Curve for model ' + str(model_name))
            plt.legend()
            plt.grid(True)
            plt.show();

        # notify
        print(f"\nTraining complete for model {model_name}!")
        try:
            # Format values to 4 decimal places
            # Round values to 4 decimal places
            rounded_training_set = [round(score, 4) for score in lst_eval_scores_TRAINING_SET]
            rounded_test_set = [round(score, 4) for score in lst_eval_scores_TEST_SET]

            rounded_std_dev_train_set = [round(score, 4) for score in lst_std_dev_TRAINING_SET]
            rounded_std_err_train_set = [round(score, 4) for score in lst_std_err_TRAINING_SET]
            
            rounded_std_dev_test_set = [round(score, 4) for score in lst_std_dev_TEST_SET]
            rounded_std_err_test_set = [round(score, 4) for score in lst_std_err_TEST_SET]
            
            rounded_score_ratio = [round(score, 4) for score in lst_score_ratio]
            rounded_p_value = round(p_value, 4)
            
            # Print rounded lists
            print("Training set:", rounded_training_set)
            print("Test set:    ", rounded_test_set)
            print('------------------------------------')
            print("Train stddev:", rounded_std_dev_train_set)
            print("Train stderr:", rounded_std_err_train_set)                       
            print("Test stddev:", rounded_std_dev_test_set)
            print("Test stderr:", rounded_std_err_test_set)            
            print('------------------------------------')
            print("Score Ratio: ", rounded_score_ratio)
            print("Trn/Tst p-value:", rounded_p_value)
            print('p-value disposition:', disposition)
            print('=========================================================================================================================================\n')
        except:        
            print("Training set:", lst_eval_scores_TRAINING_SET)
            print("Test set:    ", lst_eval_scores_TEST_SET)
            print('------------------------------------')
            print("Train stddev:", lst_std_dev_TRAINING_SET)
            print("Train stderr:", lst_std_err_TRAINING_SET)                        
            print("Test stddev:", lst_std_dev_TEST_SET)
            print("Test stderr:", lst_std_err_TEST_SET)            
            print('------------------------------------')
            print("Score Ratio: ", lst_score_ratio)
            print("Trn/Tst p-value:", p_value)
            print('p-value disposition:', disposition)
            print('=========================================================================================================================================\n')
        
        

        return lst_training_proportion, lst_eval_scores_TRAINING_SET, lst_std_dev_TRAINING_SET, lst_std_err_TRAINING_SET, lst_eval_scores_TEST_SET, lst_std_dev_TEST_SET, lst_std_err_TEST_SET, lst_score_ratio, p_value, disposition

#lst_training_proportion, lst_eval_scores_TRAINING_SET, lst_std_dev_TRAINING_SET, lst_std_err_TRAINING_SET, lst_eval_scores_TEST_SET, lst_std_dev_TEST_SET, lst_std_err_TEST_SET, lst_score_ratio, p_value, disposition = model_learning_curve('logistic_regression', X, y, 5, 0.1, 0.05, True)

Phase 3: Model & Classify the Data

Create target variable label and X features¶

In [65]:
y = df_iris.class_label
X = df_iris[['sepal_length','sepal_width','petal_length','petal_width']]
y_class_names = df_iris['category']

y = y_class_names
y_label_index = iris.target

lst_unique_class_names = list(iris.target_names)

View Matrix X and array (list) y values¶

In [66]:
# X is the matrix representing our entire features and instances
X
Out[66]:
sepal_length sepal_width petal_length petal_width
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
... ... ... ... ...
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8

150 rows × 4 columns

In [67]:
# y is the array representing our label. The index of y corresponds to the instance in X.
# e.g. the instance at index 1 is labeled to be an Iris-setosa flower
y
Out[67]:
0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: category, Length: 150, dtype: object

Create Hold-out set¶

In [68]:
# bifurcate data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,train_size=0.8)

# NOTE 1: we could just say test_size = 0.2 and leave off ‘train_size such that the rest (80%) would be training?
# NOTE 2: we could also state test_size=0.20, train_size=0.50 such that 50% of the data would randomly be assigned to training, and 30% of the data would be excluded? (possibly for validation set)

# We now have dataframes for training, and series for testing corresponding to the label for each instance in the dataframes?

# refer to https://scikit-learn.org/stable/api/ for model types, categories, hyperparameters, and configurations to implement additional models

determine datasypes¶

In [69]:
print(f'X_train is of type: {type(X_train)}')
print(f'y_train is of type: {type(y_train)}\n')
print(f'X_test is of type: {type(X_test)}')
print(f'y_test is of type: {type(y_test)}')
X_train is of type: <class 'pandas.core.frame.DataFrame'>
y_train is of type: <class 'pandas.core.series.Series'>

X_test is of type: <class 'pandas.core.frame.DataFrame'>
y_test is of type: <class 'pandas.core.series.Series'>

Let's begin modeling!¶

The general format is similar to the following:

  • [INSTANTIATION] instantiate model, set hyperparameters (if applicable)
  • [TRAINING TIME] fit the model to the training set (i.e., X_train) - this is where learning happens such that weights/hyperplanes/etc are trained to the dataset
  • [INFERENCE TIME] once we train the model, we move to inference such that the model is used to predict instances in the test set, i.e., X_test
  • [EVALUATION] Evaluate performance of the trained model against the test set (to determine generalization). Common by measuring loss of the trained model to the ground truth (labels). Multiple evaluation techniques exist depending on the model representation
    • Confusion Matrix
    -TP, FP, TN, FN
    • Accuracy, Recall, Precision, F_n Score (e.g., F1-Score)
    • AOC (Area under the Curve) and ROC (Receiver Operating Characteristic)
    • Negative Predictive Rate, etc
  • [INSPECT] Inspect the model performance e.g.:
    • Classification Report
    • View Feature Importance, etc
  • [SIMPLIFY] Simplify the model or retrain if needed
  • [DEPLOY] Deploy the model if satisfied with generality of the model

Logistic Regression¶

instantiate model¶

In [70]:
# set randomstate hyperparam for reproducability
model_LR = LogisticRegression(random_state=42)

view hyperparameters to this model¶

In [71]:
print(model_LR.get_params())
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

[TRAINING] train model

In [72]:
model_LR.fit(X_train, y_train)
Out[72]:
LogisticRegression(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(random_state=42)

[INFERENCE] predict test set

In [73]:
y_pred = model_LR.predict(X_test)

Phase 4: Model Evaluation

Evaluation Predictions¶

In [74]:
# apply trained model to test set to determine how well it might generalize to unseen data in the future
accuracy_LR = accuracy_score(y_test, y_pred) # can also do accuracy_LR = model_LR.score(X_test, y_test)
precision_LR = precision_score(y_test, y_pred, average='weighted')
recall_LR = recall_score(y_test, y_pred, average='weighted')
f1_score_LR = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy_LR:.4f}')
print(f'Precision: {precision_LR:.4f}')
print(f'Recall: {recall_LR:.4f}')
print(f'F1-Score: {f1_score_LR:.4f}')
Accuracy: 0.9333
Precision: 0.9333
Recall: 0.9333
F1-Score: 0.9333

View Confusion Matrix, Evaluation Prediction Performance, Create Classification Report¶

In [75]:
# make directory
os.makedirs('./classification_report', exist_ok=True)

mdl = model_LR
mdl_name = 'Logistic_Regression'

confusion_mtx_LR = confusion_matrix(y_test, y_pred)


try:
    # convert classification report to dictionary to then convert to df
    class_report_dict_LR = classification_report(y_test, y_pred, output_dict=True)  # This might raise an error

    # Convert dictionary to df
    df_class_report_LR = pd.DataFrame(class_report_dict_LR).transpose()

     # write to disk!
    df_class_report_LR.to_csv('classification_report/classification_report_' + mdl_name + '.csv', index=True)
except:
    print('Exception caught in classification report on model:', mdl_name)

confusion_mtx_LR
Out[75]:
array([[ 8,  0,  0],
       [ 0, 13,  1],
       [ 0,  1,  7]])
In [76]:
display_confusion_matrix(mdl_name, confusion_mtx_LR, lst_unique_class_names)
No description has been provided for this image

Summarize prediction¶

In [77]:
pd.DataFrame(y_pred).value_counts()
Out[77]:
0              
Iris-versicolor    14
Iris-setosa         8
Iris-virginica      8
Name: count, dtype: int64

Summarize Ground-Truth labels¶

In [78]:
pd.DataFrame(y_test).value_counts()
Out[78]:
category       
Iris-versicolor    14
Iris-setosa         8
Iris-virginica      8
Name: count, dtype: int64

View Feature Importance to this model¶

In [79]:
feature_names = X.columns
importances = np.abs(model_LR.coef_[0])  # Coefficients of the model

df_importances = pd.DataFrame({'Feature': feature_names,'Importance': importances}).sort_values(by='Importance', ascending=True)

# Plot
plt.figure(figsize=(10, 6))
plt.barh(df_importances['Feature'], df_importances['Importance'], edgecolor='black')
plt.xlabel('Importance')
plt.title('Feature Importance from Logistic Regression model')
plt.show()
No description has been provided for this image

View Classification Report¶

In [80]:
df_class_report_LR
Out[80]:
precision recall f1-score support
Iris-setosa 1.000000 1.000000 1.000000 8.000000
Iris-versicolor 0.928571 0.928571 0.928571 14.000000
Iris-virginica 0.875000 0.875000 0.875000 8.000000
accuracy 0.933333 0.933333 0.933333 0.933333
macro avg 0.934524 0.934524 0.934524 30.000000
weighted avg 0.933333 0.933333 0.933333 30.000000

NOTE: support indicates the number of ground-truth instances belonging to this class

Evaluate Cross Validation¶

In [81]:
#cv_score_accuracy_LR = cross_val_score(mdl, X_train, y_train, cv=10, scoring='accuracy', n_jobs=-1)
cv_score_accuracy_LR = cross_val_score(model_LR, X, y, cv=10, scoring='accuracy', n_jobs=-1)
cv_score_precision_LR = cross_val_score(model_LR, X, y, cv=10, scoring='precision_macro', n_jobs=-1)
cv_score_recall_LR = cross_val_score(model_LR, X, y, cv=10, scoring='recall_macro', n_jobs=-1)
cv_score_f1_score_LR = cross_val_score(model_LR, X, y, cv=10, scoring='f1_macro', n_jobs=-1)

print('cross-validation evaluation complete\n')
# use average='weighted': One way to account for label imbalance by calculating metrics for each label, average (weighted by support (the number of true instances for each label))

print(f'Accuracy: {cv_score_accuracy_LR.mean():.4f}')
print(f'Precision {cv_score_precision_LR.mean():.4f}')
print(f'Recall: {cv_score_recall_LR.mean():.4f}')
print(f'F1- Score: {cv_score_f1_score_LR.mean():.4f}')
cross-validation evaluation complete

Accuracy: 0.9733
Precision 0.9778
Recall: 0.9733
F1- Score: 0.9731

Investigate if Model is overfitting¶

We will do this by comparing learning rate of training data vs learning rate of test data

In [82]:
# set lerning curves to compute at varying amounts of training data
#lst_training_size = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80]
lst_training_size = [0.05, 0.10, 0.20, 0.40, 0.80]

# set test size
testSize = 0.1

# set num trials
numTrials = 50

# create list to store eval scores ffor training and test sets
lst_eval_scores_TRAINING_SET = []
lst_eval_scores_TEST_SET = []

# outter loop: training size
for i in range(len(lst_training_size)):
    # set training size percentage   
    trainSize = lst_training_size[i]
    
    # init eval scores at this percentage (index)
    total_eval_score_TRAINING = 0
    total_eval_score_TEST = 0

    # iterate num trails, store average eval score
    for j in range(numTrials):
        
        # establish training/testing hold-out
        X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=testSize,train_size=trainSize,shuffle=True) 

        # initialize mode
        model_LogReg = LogisticRegression() # use default parameters
        
        # train the model to learn the training set
        model_LogReg.fit(X_train, y_train) 

        # use score helper function to apply trained model to the test set and return accuracy. then accumulate model eval performance
        eval_score_TEST = model_LogReg.score(X_test,y_test)
        total_eval_score_TEST = total_eval_score_TEST + eval_score_TEST
        
        eval_score_TRAINING = model_LogReg.score(X_train,y_train)
        total_eval_score_TRAINING = total_eval_score_TRAINING + eval_score_TRAINING

     # done with num trials at this training size, store average evaluation score
    lst_eval_scores_TEST_SET.append(total_eval_score_TEST/numTrials)
    lst_eval_scores_TRAINING_SET.append(total_eval_score_TRAINING/numTrials)

# notify
print("Training complete!")
print("Training set:", lst_eval_scores_TRAINING_SET)
print("Test set:    ", lst_eval_scores_TEST_SET)

# plot learning curves
plt.figure(figsize=(10, 6))
plt.plot(lst_training_size,lst_eval_scores_TEST_SET, label='Test Set', marker='o')
plt.plot(lst_training_size,lst_eval_scores_TRAINING_SET, label='Training Set', marker='x')
plt.xlabel('Training Size')
plt.ylabel('Evaluation Score')
plt.title('Learning Curve')
plt.legend()
plt.grid(True)
plt.show();
Training complete!
Training set: [0.9428571428571425, 0.9519999999999996, 0.9826666666666669, 0.9730000000000003, 0.9743333333333339]
Test set:     [0.7826666666666666, 0.8959999999999997, 0.9373333333333328, 0.9559999999999996, 0.9666666666666663]
No description has been provided for this image

Compare if score from training set and test set are statistically different¶

In [83]:
# we use paired t-test becauses the underlying data is similar
alpha = 0.05
t_statistic, p_value = st.ttest_rel(lst_eval_scores_TRAINING_SET, lst_eval_scores_TEST_SET)

print(f'p-value is {p_value:.4f}')

if p_value < 0.05:
    print('Conclusion: We reject null hypothesis and conclude the difference between these 2 sets are statistically significant.\n\n')
else:
    print('Conclusion: We fail to reject the null hypothesis since we could not observe a statistically significant difference between the 2 sets.\n\n')
p-value is 0.1033
Conclusion: We fail to reject the null hypothesis since we could not observe a statistically significant difference between the 2 sets.


With an alpha of 0.05, we do not have enough evidence to reject the null hypothesis.
The null hypothesis states there is no observable difference between the 2 sets (or in other words, the null hypothesis states the differences observed are due to random noise and not due to a specific intervention).

Here, p_value > 0.05, therefore the 2 sets are quite similar. We don't use p-value directly to conclude overfitting vice underfitting, but to at least be close to conclude overfitting or underfitting, then we would expect the difference between these 2 sets to be statistically significant.
If we could determine statistical significance, then we could measure the relation of the training line compared to the test line to state over or underfitting.

Calculate Learning Curve Score Ratio¶

In [84]:
score_ratio = np.array(lst_eval_scores_TRAINING_SET) / np.array(lst_eval_scores_TEST_SET)
print(f'Ratio set: {score_ratio}')
print(f'Ratio average: {score_ratio.mean()}')
Ratio set: [1.20467267 1.0625     1.04836415 1.01778243 1.00793103]
Ratio average: 1.068250056927529

There are multiple ways to evaluate values in our learning curve. We will use the score ration to determine how the model is performing

  • Ratio Close to 1: If the ratio of training score to test score is close to 1 (e.g., between 0.95 and 1.05), it suggests that the model's performance on training and test sets is similar.
  • Moderate Ratio: Ratios between 1.05 and 1.20 might indicate moderate overfitting.
  • High Ratio: Ratios greater than 1.20 suggest significant overfitting, as the model performs much better on the training set than on the test set.

Train the Models to Graph Learning Curve of each models' Test Set¶

In [85]:
%%time
# re-init
X = df_iris[['sepal_length','sepal_width','petal_length','petal_width']]
y = df_iris.class_label
num_trials = 50


# logistic regression
lst_training_proportion_LOGISTIC_REGRESSION, lst_eval_scores_TRAINING_SET_LOGISTIC_REGRESSION, lst_std_dev_TRAINING_SET_LOGISTIC_REGRESSION, lst_std_err_TRAINING_SET_LOGISTIC_REGRESSION, lst_eval_scores_TEST_SET_LOGISTIC_REGRESSION, lst_std_dev_TEST_SET_LOGISTIC_REGRESSION, lst_std_err_TEST_SET_LOGISTIC_REGRESSION, lst_score_ratio_LOGISTIC_REGRESSION, p_value_LOGISTIC_REGRESSION, disposition_LOGISTIC_REGRESSION = model_learning_curve('logistic_regression', X, y, num_trials, 0.1, 0.05, False)

# nbc
lst_training_proportion_NBC, lst_eval_scores_TRAINING_SET_NBC, lst_std_dev_TRAINING_SET_NBC, lst_std_err_TRAINING_SET_NBC, lst_eval_scores_TEST_SET_NBC, lst_std_dev_TEST_SET_NBC, lst_std_err_TEST_SET_NBC, lst_score_ratio_NBC, p_value_NBC, disposition_NBC = model_learning_curve('nbc_gaussian', X, y, num_trials, 0.1, 0.05, False)

# svm
lst_training_proportion_SVC, lst_eval_scores_TRAINING_SET_SVC, lst_std_dev_TRAINING_SET_SVC, lst_std_err_TRAINING_SET_SVC, lst_eval_scores_TEST_SET_SVC, lst_std_dev_TEST_SET_SVC, lst_std_err_TEST_SET_SVC, lst_score_ratio_SVC, p_value_SVC, disposition_SVC = model_learning_curve('svc', X, y, num_trials, 0.1, 0.05, False)

# perceptron
lst_training_proportion_PERCEPTRON, lst_eval_scores_TRAINING_SET_PERCEPTRON, lst_std_dev_TRAINING_SET_PERCEPTRON, lst_std_err_TRAINING_SET_PERCEPTRON, lst_eval_scores_TEST_SET_PERCEPTRON, lst_std_dev_TEST_SET_PERCEPTRON, lst_std_err_TEST_SET_PERCEPTRON, lst_score_ratio_PERCEPTRON, p_value_PERCEPTRON, disposition_PERCEPTRON = model_learning_curve('perceptron', X, y, num_trials, 0.1, 0.05, False)

# mlp
lst_training_proportion_MLP, lst_eval_scores_TRAINING_SET_MLP, lst_std_dev_TRAINING_SET_MLP, lst_std_err_TRAINING_SET_MLP, lst_eval_scores_TEST_SET_MLP, lst_std_dev_TEST_SET_MLP, lst_std_err_TEST_SET_MLP, lst_score_ratio_MLP, p_value_MLP, disposition_MLP = model_learning_curve('mlp', X, y, num_trials, 0.1, 0.05, False)

# decision tree
lst_training_proportion_DECISION_TREE, lst_eval_scores_TRAINING_SET_DECISION_TREE, lst_std_dev_TRAINING_SET_DECISION_TREE, lst_std_err_TRAINING_SET_DECISION_TREE, lst_eval_scores_TEST_SET_DECISION_TREE, lst_std_dev_TEST_SET_DECISION_TREE, lst_std_err_TEST_SET_DECISION_TREE, lst_score_ratio_DECISION_TREE, p_value_DECISION_TREE, disposition_DECISION_TREE = model_learning_curve('decision_tree', X, y, num_trials, 0.1, 0.05, False)

print('\nDONE!')
commencing evaluation of model [logistic_regression]

Training complete for model logistic_regression!
Training set: [0.9314, 0.964, 0.9713, 0.971, 0.9736, 0.9742]
Test set:     [0.74, 0.8493, 0.9333, 0.936, 0.956, 0.964]
------------------------------------
Train stddev: [0.0877, 0.0451, 0.0278, 0.0174, 0.0134, 0.0076]
Train stderr: [0.0877, 0.0319, 0.016, 0.0087, 0.006, 0.0031]
Test stddev: [0.1566, 0.1188, 0.0774, 0.0631, 0.0497, 0.0409]
Test stderr: [0.1566, 0.084, 0.0447, 0.0316, 0.0222, 0.0167]
------------------------------------
Score Ratio:  [1.2587, 1.135, 1.0407, 1.0374, 1.0184, 1.0105]
Trn/Tst p-value: 0.0668
p-value disposition: fail to reject null hypothesis; observed difference between 2 samples are not statistically significant
=========================================================================================================================================

commencing evaluation of model [nbc_gaussian]

Training complete for model nbc_gaussian!
Training set: [0.9971, 0.9867, 0.9667, 0.9633, 0.9616, 0.959]
Test set:     [0.5827, 0.844, 0.944, 0.932, 0.9587, 0.968]
------------------------------------
Train stddev: [0.0202, 0.0301, 0.0309, 0.0196, 0.0103, 0.0077]
Train stderr: [0.0202, 0.0213, 0.0178, 0.0098, 0.0046, 0.0031]
Test stddev: [0.1974, 0.1344, 0.0622, 0.0639, 0.0444, 0.0452]
Test stderr: [0.1974, 0.0951, 0.0359, 0.0319, 0.0199, 0.0184]
------------------------------------
Score Ratio:  [1.7113, 1.169, 1.024, 1.0336, 1.003, 0.9907]
Trn/Tst p-value: 0.1899
p-value disposition: fail to reject null hypothesis; observed difference between 2 samples are not statistically significant
=========================================================================================================================================

commencing evaluation of model [svc]

Training complete for model svc!
Training set: [0.9743, 0.9827, 0.9773, 0.9793, 0.9849, 0.9878]
Test set:     [0.8187, 0.936, 0.96, 0.9667, 0.9773, 0.968]
------------------------------------
Train stddev: [0.0554, 0.0325, 0.0265, 0.0156, 0.0086, 0.007]
Train stderr: [0.0554, 0.023, 0.0153, 0.0078, 0.0039, 0.0029]
Test stddev: [0.1451, 0.0785, 0.0467, 0.0431, 0.0319, 0.0431]
Test stderr: [0.1451, 0.0555, 0.0269, 0.0216, 0.0143, 0.0176]
------------------------------------
Score Ratio:  [1.1901, 1.0499, 1.0181, 1.0131, 1.0077, 1.0205]
Trn/Tst p-value: 0.1204
p-value disposition: fail to reject null hypothesis; observed difference between 2 samples are not statistically significant
=========================================================================================================================================

commencing evaluation of model [perceptron]

Training complete for model perceptron!
Training set: [0.7314, 0.684, 0.708, 0.7397, 0.7267, 0.7505]
Test set:     [0.5653, 0.628, 0.6253, 0.7453, 0.736, 0.772]
------------------------------------
Train stddev: [0.2094, 0.1687, 0.1526, 0.1304, 0.1449, 0.1195]
Train stderr: [0.2094, 0.1193, 0.0881, 0.0652, 0.0648, 0.0488]
Test stddev: [0.2017, 0.1661, 0.1914, 0.1626, 0.1761, 0.1611]
Test stderr: [0.2017, 0.1175, 0.1105, 0.0813, 0.0788, 0.0658]
------------------------------------
Score Ratio:  [1.2938, 1.0892, 1.1322, 0.9924, 0.9873, 0.9722]
Trn/Tst p-value: 0.1903
p-value disposition: fail to reject null hypothesis; observed difference between 2 samples are not statistically significant
=========================================================================================================================================

commencing evaluation of model [mlp]

Training complete for model mlp!
Training set: [0.9943, 0.9947, 0.9907, 0.9853, 0.9858, 0.9797]
Test set:     [0.876, 0.9347, 0.9733, 0.9707, 0.9693, 0.9787]
------------------------------------
Train stddev: [0.0404, 0.0183, 0.0166, 0.0141, 0.009, 0.0072]
Train stderr: [0.0404, 0.0129, 0.0096, 0.0071, 0.004, 0.0029]
Test stddev: [0.1611, 0.0928, 0.0356, 0.0408, 0.0386, 0.0367]
Test stderr: [0.1611, 0.0656, 0.0206, 0.0204, 0.0173, 0.015]
------------------------------------
Score Ratio:  [1.135, 1.0642, 1.0178, 1.0151, 1.017, 1.001]
Trn/Tst p-value: 0.089
p-value disposition: fail to reject null hypothesis; observed difference between 2 samples are not statistically significant
=========================================================================================================================================

commencing evaluation of model [decision_tree]

Training complete for model decision_tree!
Training set: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Test set:     [0.796, 0.9147, 0.9427, 0.9413, 0.9493, 0.9453]
------------------------------------
Train stddev: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Train stderr: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Test stddev: [0.1711, 0.0738, 0.0587, 0.0627, 0.0477, 0.0498]
Test stderr: [0.1711, 0.0522, 0.0339, 0.0313, 0.0213, 0.0203]
------------------------------------
Score Ratio:  [1.2563, 1.0933, 1.0608, 1.0623, 1.0534, 1.0578]
Trn/Tst p-value: 0.0172
p-value disposition:  reject null hypothesis; observed difference between 2 samples are statistically significant
=========================================================================================================================================


DONE!
CPU times: user 1min 17s, sys: 16.3 ms, total: 1min 17s
Wall time: 1min 31s

Graph the data¶

In [86]:
# Plotting the learning curve
plt.figure(figsize=(15, 5))
plt.errorbar(lst_training_proportion_LOGISTIC_REGRESSION, lst_eval_scores_TEST_SET_LOGISTIC_REGRESSION, yerr=lst_std_err_TEST_SET_LOGISTIC_REGRESSION, fmt='-o', capsize=3, marker='o', label="Logistic Regression")
plt.errorbar(lst_training_proportion_LOGISTIC_REGRESSION, lst_eval_scores_TEST_SET_NBC, yerr=lst_std_err_TEST_SET_NBC, fmt='-o', capsize=3, marker='X', label="NBC")
plt.errorbar(lst_training_proportion_LOGISTIC_REGRESSION, lst_eval_scores_TEST_SET_SVC, yerr=lst_std_err_TEST_SET_SVC, fmt='-o', capsize=3, marker='>', label="SVC")
plt.errorbar(lst_training_proportion_LOGISTIC_REGRESSION, lst_eval_scores_TEST_SET_PERCEPTRON, yerr=lst_std_err_TEST_SET_PERCEPTRON, fmt='-o', capsize=3, marker='<', label="Perceptron")

# Fill the area between the upper and lower bounds of the error bars
plt.fill_between(lst_training_proportion_LOGISTIC_REGRESSION, np.array(lst_eval_scores_TEST_SET_PERCEPTRON) - np.array(lst_std_err_TEST_SET_PERCEPTRON),
                 np.array(lst_eval_scores_TEST_SET_PERCEPTRON) + np.array(lst_std_err_TEST_SET_PERCEPTRON),
                 color='red', alpha=0.2, label="Perceptron Model Std Error Area")

plt.xlabel('Training Size Proportion')
plt.ylabel('Average Accuracy')
plt.title('Learning Curve for Test Set' )
#plt.legend(['NBC Test Set', 'Perceptron Test Set', 'Decision Tree Test Set'])
plt.legend(title="Model")
plt.grid(True)
plt.show()
No description has been provided for this image

Phase 5: Deploy Model

In [87]:
i = 0
instance = X_test.iloc[[i]]
prediction =  model_LR.predict(instance)
ground_truth = y_test.iloc[i]
prediction_correct = (prediction == ground_truth)[0]

print('instance index:', i, ' prediction:', prediction, ' ground truth:', ground_truth, ' prediction correct:', prediction_correct)
instance index: 0  prediction: ['Iris-setosa']  ground truth: Iris-setosa  prediction correct: True

Phase 3: Modeling and Classification UPDATE Using Standardization

execute learning and evaluation into a function such that we can streamline evaluation of multiple models¶

standardize the data¶

In [88]:
# we typically standardize our data where values in each column are scaled to a mean of 0. Then the z-score of each value represents a magnitude of how many standard deviations it is from the mean.
# standardization helps us to achieve convergence and reduce values from a particular feature being too dominant in our learning process
In [89]:
# recall, the distribution estimate of our values is the following
In [90]:
df_iris.drop('class_label', axis=1).plot(kind='kde');
No description has been provided for this image

observation: notice this data is not Z-Score standardized

In [91]:
# instantiate scaler
scaler = StandardScaler()

# fit the data to the scaler
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
In [92]:
df_scaled = pd.DataFrame(X_train_scaled, columns=lst_attributes)
df_scaled.plot(kind='kde');
No description has been provided for this image

observation: although not perfect, the density estimates are now better aligned at mean = 0

In [93]:
sns.kdeplot(data=df_scaled, fill=True);
No description has been provided for this image
In [94]:
df_scaled.plot(kind="box");
No description has been provided for this image
In [95]:
sns.boxplot(data=df_scaled);
sns.swarmplot(data=df_scaled, color='black');
plt.show()
No description has been provided for this image
In [96]:
# Create a 2x2 subplot layout
df_scaled_copy = df_scaled.copy()
df_scaled_copy['category'] = df_iris['category']

# set data to plot
df_to_plot = df_scaled_copy.copy()

fig, axes = plt.subplots(2, 2, figsize=(14, 14))
axes = axes.flatten()

# Plot boxplot and swarmplot for each feature
for i, feature in enumerate(lst_attributes):
    sns.boxplot(ax=axes[i], data=df_to_plot, x='category', y=feature)
    sns.swarmplot(ax=axes[i], data=df_to_plot, x='category', y=feature)
    #axes[i].set_title(f'{feature.capitalize()} by category')
    axes[i].set_title(f'{feature} by category')
    axes[i].set_xlabel('category')
    axes[i].set_ylabel(feature)


plt.subplots_adjust(wspace=0.3, hspace=0.4)
#plt.tight_layout()
plt.show()
No description has been provided for this image
In [97]:
# Set the figure size
plt.figure(figsize=(15, 15))

# set data to plot
df_to_plot = df_scaled_copy.copy()

# List of dependent variables to plot
variables = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Loop through each subplot
for i, feature in enumerate(variables):
    plt.subplot(2,2,i+1)
    sns.violinplot(x='category', y=feature, data=df_to_plot)
    plt.title(feature)

# Adjust layout to prevent overlap
#plt.tight_layout()
plt.subplots_adjust(wspace=0.3, hspace=0.2)
plt.show()
No description has been provided for this image

combine subplots for boxplot and violin plots¶

In [98]:
# Set the figure size
plt.figure(figsize=(15, 15))

# set data to plot
df_to_plot = df_scaled_copy.copy()

# List of dependent variables to plot
variables = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Loop through each subplot
for i, feature in enumerate(variables):
    plt.subplot(2,2,i+1)    
    sns.violinplot(x='category', y=feature, data=df_to_plot)
    #sns.boxplot(data=df_to_plot, x='category', y=feature)
    sns.swarmplot(data=df_to_plot, x='category', y=feature)
    
    plt.title(feature)

# Adjust layout to prevent overlap
#plt.tight_layout()
plt.subplots_adjust(wspace=0.3, hspace=0.2)
plt.show()
No description has been provided for this image
In [99]:
# set data to plot
df_to_plot = df_scaled_copy.copy()

# Set up the figure and axes
n_features = len(lst_attributes)
fig, axes = plt.subplots(n_features, 2, figsize=(13, 5 * n_features))

# Loop through each feature and plot the boxplot, swarmplot, and violin plot
for i, feature in enumerate(lst_attributes):
    # Plot boxplot and swarmplot
    sns.boxplot(ax=axes[i, 0], data=df_to_plot, x='category', y=feature)
    sns.swarmplot(ax=axes[i, 0], data=df_to_plot, x='category', y=feature)  
    axes[i, 0].set_title(f'{feature} by category')
    axes[i, 0].set_xlabel('category')
    axes[i, 0].set_ylabel(feature)

# Plot violin plot
    sns.violinplot(ax=axes[i, 1], x='category', y=feature, data=df_to_plot)
    sns.swarmplot(ax=axes[i, 1], data=df_to_plot, x='category', y=feature)  
    axes[i, 1].set_title(f'{feature} by category')
    axes[i, 1].set_xlabel('category')
    axes[i, 1].set_ylabel(feature)

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()
No description has been provided for this image

HELPER FUNCTIONS ¶

Get Time¶

In [100]:
def get_time():
    formatted_time = ""

    try:
        now = datetime.datetime.now()

        year = now.year
        month = now.month
        day = now.day
        hour = now.hour
        min = now.minute

        formatted_time = "{}-{:02}-{:02}-{:02}{:02}".format(year, month, day, hour, min)

    except Exception as error:
        print_exception(error, inspect.currentframe().f_code.co_name, False)

    return formatted_time

Calculate Time Duration¶

In [101]:
def calculate_time_duration(datetime_duration):
    formatted_time = ""

    try:
        days = datetime_duration.days
        hours = datetime_duration.seconds // 3600
        minutes = (datetime_duration.seconds % 3600) // 60
        # seconds = str(divmod(datetime_duration.seconds, 1))
        seconds = datetime_duration.seconds % 60

        formatted_time = f"{days} day(s), {hours} hour(s), {minutes} minute(s), {seconds} second(s)"

    except Exception as error:
        print_exception(error, inspect.currentframe().f_code.co_name, False)

    return formatted_time

Display Sum NaN Function¶

In [102]:
def display_NaN_columns(dataframe):
    try:
        
        lst_nan_sum = dataframe.isna().sum()

        filtered_cols_with_na_sum_greater_than_zero = lst_nan_sum[lst_nan_sum > 0]

        if len(filtered_cols_with_na_sum_greater_than_zero) < 1:
            print("NO COLS WITH NaN!!!!!!!!!!!!")
        else:
            print(f'Columns with NaN: {len(filtered_cols_with_na_sum_greater_than_zero)}')
            print(filtered_cols_with_na_sum_greater_than_zero)
        
        
    except Exception as error:
        print_exception(error, inspect.currentframe().f_code.co_name, False) 

Check Unique Cols¶

In [103]:
def print_nunique(SERIES, num_columns):
    # iterate through df
    if num_columns < 1:
        num_columns = 1
    try:
        count = 0
        for col, vol in SERIES.items():
            print(f'{col}:  {vol}', end='\t\t\t\t')
            count += 1
            
            if count % num_columns == 0:
                print('')
            
    except Exception as error:
        print_exception(error, inspect.currentframe().f_code.co_name, False)

Feature Importance¶

In [104]:
# this is a helper function for 
lst_coef_models = ['logistic_regression', 'perceptron', 'sgd_classifier', 'passive_aggressive', 'ridge_classifier', 'linear_svc', 'svc', 'svc_nu']
lst_all_feature_importance = ['bagging_classifier']
lst_permutation_importance = ['nbc_gaussian', 'knn' , 'ada_boosting', 'quadratic_discriminant', 'linear_discriminant', 'hist_boosting', 'gaussian_process', 'mlp', 'nbc_bernoulli']

def display_feature_importance_chart(model, model_name, lst_feature_names, num_coefficients_to_display, figure_save_name_ok_to_be_null):

    # not all functions have the same protocol to extract feature importance coefficients
    if model_name in lst_coef_models:
        ftr_importance = np.abs(model.coef_[0])
        
    elif model_name in lst_all_feature_importance:
        all_feature_importances = np.array([tree.feature_importances_ for tree in model.estimators_])
    
        # Average the feature importances across all trees
        mean_feature_importances = np.mean(all_feature_importances, axis=0)
    
        ftr_importance = mean_feature_importances
    
    elif model_name in lst_permutation_importance:
        results = permutation_importance(model, X_test, y_test, scoring='accuracy', n_repeats=10, random_state=42)
    
        # Get feature importances
        importance = results.importances_mean
        ftr_importance = np.abs(importance)    
        
    #elif model_name in lst_log_prob:
    #    ftr_importance = model.feature_log_prob_
    
    else:
        ftr_importance = model.feature_importances_


    return display_feature_importance(model_name, lst_feature_names, ftr_importance, num_coefficients_to_display, figure_save_name_ok_to_be_null)
    
In [105]:
#display feature importance
def display_feature_importance(model_name, list_col_names, feature_importances, num_features, figure_save_name_ok_to_be_null):
    try:
        # make directory
        os.makedirs('./feature_importance', exist_ok=True)
        os.makedirs('./feature_importance_data', exist_ok=True)

        
        #plot importances
        feature_names = list_col_names
        importances = feature_importances
        indices = np.argsort(importances)

        # save coefficients to file
        # reverse indices
        reverse_indices = indices[::-1]
        try:
            if figure_save_name_ok_to_be_null is not None and len(figure_save_name_ok_to_be_null) > 0:
                # Export to a file
                with open(str('./feature_importance_data/' + figure_save_name_ok_to_be_null) + '.csv', 'w') as file:
                    # write header
                    file.write(str(model_name) + '_feature,importance_coefficient\n')
                    
                    # write values
                    for i in reverse_indices:
                        file.write(str(list_col_names[i]) + "," + str(importances[i]) + '\n')
        except:
            pass
        

        # reduce the number of values to print to cell
        if num_features > 0:
            indices = np.argsort(importances)[-num_features:]

        
        figure_height = math.ceil(num_features / 2)
        
        if figure_height < 1:
            figure_height = math.ceil(len(feature_importances) / 3)
        
        plt.figure(figsize=(15, figure_height))
        plt.title("Feature Importances for model [" + str(model_name) + "]")
        plt.barh(range(len(indices)), importances[indices], align="center", edgecolor='black')
        plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
        plt.xlabel("Relative Importance")

        # ensure all text fits within figure
        plt.tight_layout()
        
        try:
          if figure_save_name_ok_to_be_null is not None and len(figure_save_name_ok_to_be_null) > 0:
              # Save the figure
              plt.savefig('./feature_importance/' + str(figure_save_name_ok_to_be_null) + '.png', bbox_inches='tight')    # add 'tight' to ensure all text fits inside the figure
        except:
            pass
            
        #plt.show()
        
                
            
        
        #display list of coefficients
        print("coefficients:\n====================")
        #count = 0
        #for i,v in enumerate(importances):
        #    print('%s:\t %.5f' % (list_col_names[i],v))
        #    count += 1
        #    if num_features > 0 and count > num_features:
        #        break

        # resort and take limit number of expected output features
        if num_features > 0:
            reverse_indices = indices[::-1]
            
        #print
        for i in reverse_indices:
            print(str(list_col_names[i]) + "\t" + str(importances[i]))

        
    
    except Exception as error:
        print(error)
        print(inspect.currentframe().f_code.co_name)

Instantiate Models

In [106]:
RANDOM_STATE = 42

dict_models = {
# linear models
'logistic_regression': LogisticRegression(random_state=RANDOM_STATE),
'perceptron': linear_model.Perceptron(random_state=RANDOM_STATE),
'sgd_classifier': SGDClassifier(random_state=RANDOM_STATE),
'passive_aggressive': linear_model.PassiveAggressiveClassifier(random_state=RANDOM_STATE),
'ridge_classifier': RidgeClassifier(),


# SVM
'linear_svc': svm.LinearSVC(random_state=RANDOM_STATE),
'svc': svm.SVC(probability=True, kernel='linear', random_state=RANDOM_STATE),
'svc_nu': svm.NuSVC(probability=True, kernel='linear', random_state=RANDOM_STATE),

# tree
'decision_tree': DecisionTreeClassifier(random_state=RANDOM_STATE), #random_state=42

    
# ensemble
'random_forest': RandomForestClassifier(random_state=RANDOM_STATE), # random_state=42
'extra_tree': ExtraTreesClassifier(random_state=RANDOM_STATE),
'bagging_classifier': BaggingClassifier(random_state=RANDOM_STATE),
'gradient_boosting': ensemble.GradientBoostingClassifier(random_state=RANDOM_STATE), # random_state=42
'ada_boosting': ensemble.AdaBoostClassifier(random_state=RANDOM_STATE),
'hist_boosting': HistGradientBoostingClassifier(random_state=RANDOM_STATE),
#'xgb': xgboost.XGBClassifier(),

   

# Gaussian Processes
'gaussian_process': gaussian_process.GaussianProcessClassifier(random_state=RANDOM_STATE),


# NBC probabilistic
'nbc_gaussian': GaussianNB(),
'nbc_bernoulli': naive_bayes.BernoulliNB(),
#'nbc_multinomial': naive_bayes.MultinomialNB(),
#'nbc_categorical': naive_bayes.CategoricalNB(),

    
# neighbors
'knn': KNeighborsClassifier(),
#'radius_neighbors': neighbors.RadiusNeighborsClassifier(random_state=RANDOM_STATE),

    
# discriminant analysis
'quadratic_discriminant': QuadraticDiscriminantAnalysis(),
'linear_discriminant': LinearDiscriminantAnalysis(),

# neural network
'mlp': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=RANDOM_STATE)

# NOTE: there are many more we can add, this is just a starting point!
}

Classify Dataset!!!¶

In [107]:
def classify_dataset(mdl_name, mdl, X_trn, X_tst, y_trn, y_tst, class_lbls):
    analysis_start_time_text = "not started"
    analysis_end_time_text = "not started"
    
    analysis_start_time = datetime.datetime.now()
    analysis_start_time_text = get_time()
    
    # make directory
    os.makedirs('./classification_report', exist_ok=True)

    # notify
    print(f'\ntraining model: {mdl_name}...')

    ##################################################
    # train the model
    ##################################################
    mdl.fit(X_trn, y_trn)

    ##################################################
    # predict test set
    ##################################################
    y_prd = mdl.predict(X_tst)

    ##################################################
    # evaluate model
    ##################################################
    
    # apply trained model to test set to determine how well it might generalize to unseen data in the future
    
    accuracy_mdl = accuracy_score(y_tst, y_prd) # can also do accuracy_mdl = mdl.score(X_test, y_tst)
    precision_mdl = precision_score(y_tst, y_prd, average='weighted')
    recall_mdl = recall_score(y_tst, y_prd, average='weighted')
    f1_score_mdl = f1_score(y_tst, y_prd, average='weighted')
    
    cv_score_accuracy_mdl = cross_val_score(mdl, X_trn, y_trn, cv=10, scoring='accuracy', n_jobs=-1)[0]
    cv_score_precision_mdl = cross_val_score(mdl, X_trn, y_trn, cv=10, scoring='precision_macro', n_jobs=-1)[0]
    cv_score_recall_mdl = cross_val_score(mdl, X_trn, y_trn, cv=10, scoring='recall_macro', n_jobs=-1)[0]
    cv_score_f1_score_mdl = cross_val_score(mdl, X_trn, y_trn, cv=10, scoring='f1_macro', n_jobs=-1)[0]
    
    confusion_mtx_mdl = confusion_matrix(y_tst, y_prd)
    
    
    try:
        # convert classification report to dictionary to then convert to df
        class_report_dict_mdl = classification_report(y_tst, y_prd, output_dict=True)  # This might raise an error
    
        # Convert dictionary to df
        df_class_report_mdl = pd.DataFrame(class_report_dict_mdl).transpose()
    
         # write to disk!
        df_class_report_mdl.to_csv('classification_report/classification_report_' + mdl_name + '.csv', index=True)
    except:
        print('Exception caught in classification report on model:', mdl_name)


    
    
    #########################################################
    # analysis summary
    #########################################################
    analysis_end_time = datetime.datetime.now()
    analysis_end_time_text = get_time()
    
    analysis_duration = analysis_end_time - analysis_start_time
    analysis_duration_text = calculate_time_duration(analysis_duration)  
    
    print('\nDone!\n')
    print(f'cv_precision: {cv_score_precision_mdl:.4f}', end='\t')
    print(f'cv_recall: {cv_score_recall_mdl:.4f}', end='\t')
    print(f'cv_f1_score: {cv_score_f1_score_mdl:.4f}', end='\t')
    print(f'accuracy: {accuracy_mdl:.4f}')
    print("Start Time:\t\t" + analysis_start_time_text)
    print("End Time:\t\t" + analysis_end_time_text)
    print("Analysis Duration:\t" + analysis_duration_text)    
    print('============================================================================================================')

    return (accuracy_mdl, precision_mdl, recall_mdl, f1_score_mdl, cv_score_accuracy_mdl, cv_score_precision_mdl, cv_score_recall_mdl, cv_score_f1_score_mdl, confusion_mtx_mdl, df_class_report_mdl,
           analysis_start_time_text, analysis_end_time_text, analysis_duration_text)

Train & Evaluate Models¶

In [108]:
#%%time

analysis_start_EVAL_text = "not started"
analysis_end_EVAL_text = "not started"

analysis_start_EVAL = datetime.datetime.now()
analysis_start_EVAL_text = get_time()

dict_trained_model_data = dict()

######################################################
# create dataframe to store eval scores
######################################################
lst_eval_score_cols = ['precision', 'recall', 'f1_score', 'accuracy']
df_evaluation_score = pd.DataFrame(columns=lst_eval_score_cols)



######################################################
# INVOKE MODELS TO LEARN THE DATA
######################################################
for key in dict_models.keys():
    mdl_name = key
    mdl = dict_models[key]

    # train and evaluate each model!
    #(accuracy_mdl, precision_mdl, recall_mdl, f1_score_mdl, cv_score_accuracy_mdl, cv_score_precision_mdl, cv_score_recall_mdl, cv_score_f1_score_mdl, confusion_mtx_mdl, df_class_report_mdl, \
    #       analysis_start_time_text, analysis_end_time_text, analysis_duration_text) = classify_dataset(mdl_name, mdl, X_train_scaled, X_test_scaled, y_train, y_test, lst_unique_class_names)

    # store the trained model
    dict_trained_model_data[mdl_name] = classify_dataset(mdl_name, mdl, X_train_scaled, X_test_scaled, y_train, y_test, lst_unique_class_names)

    cv_score_precision_mdl  = dict_trained_model_data[mdl_name][5]
    cv_score_recall_mdl = dict_trained_model_data[mdl_name][6]
    cv_score_f1_score_mdl = dict_trained_model_data[mdl_name][7]
    cv_score_accuracy_mdl = dict_trained_model_data[mdl_name][4]

    # append eval score to dataframe
    instance_evaluation = {
        'model': mdl_name, 
        'precision': cv_score_precision_mdl, 
        'recall': cv_score_recall_mdl, 
        'f1_score': cv_score_f1_score_mdl,
        'accuracy': cv_score_accuracy_mdl    }
    
    df_evaluation_score.loc[mdl_name] = instance_evaluation 
    
    print('============================================================================================================\n\n')

# notify
print('******************************************************************************************************************')
print('******************************************************************************************************************')
print('******************************************************************************************************************')
print("\nLEARNING COMPLETE!!!\n========================================")
analysis_end_EVAL = datetime.datetime.now()
analysis_end_EVAL_text = get_time()

analysis_duration_FINAL = analysis_end_EVAL - analysis_start_EVAL
analysis_duration_EVAL = calculate_time_duration(analysis_duration_FINAL)  

print("Trained model count:\t" + str(len(dict_models)))
print("Start Time:\t\t" + analysis_start_EVAL_text)
print("End Time:\t\t" + analysis_end_EVAL_text)
print("Analysis Duration:\t" + analysis_duration_EVAL)     
training model: logistic_regression...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 0.8667
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2038
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: perceptron...

Done!

cv_precision: 0.8611	cv_recall: 0.8333	cv_f1_score: 0.8110	accuracy: 0.8000
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2038
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: sgd_classifier...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 0.9333
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2038
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: passive_aggressive...

Done!

cv_precision: 0.5238	cv_recall: 0.6667	cv_f1_score: 0.5758	accuracy: 0.7333
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2038
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: ridge_classifier...

Done!

cv_precision: 0.4667	cv_recall: 0.5000	cv_f1_score: 0.4815	accuracy: 0.8667
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2038
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: linear_svc...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 0.9333
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2038
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: svc...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 0.8667
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2038
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: svc_nu...

Done!

cv_precision: 0.9333	cv_recall: 0.8889	cv_f1_score: 0.8963	accuracy: 0.8000
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2038
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: decision_tree...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 0.8667
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2038
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: random_forest...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 1.0000
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2038
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 8 second(s)
============================================================================================================
============================================================================================================



training model: extra_tree...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 0.8667
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2038
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 6 second(s)
============================================================================================================
============================================================================================================



training model: bagging_classifier...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 0.8000
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2038
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 1 second(s)
============================================================================================================
============================================================================================================



training model: gradient_boosting...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 0.8667
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2039
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 13 second(s)
============================================================================================================
============================================================================================================



training model: ada_boosting...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 0.8667
Start Time:		2025-02-07-2039
End Time:		2025-02-07-2039
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 5 second(s)
============================================================================================================
============================================================================================================



training model: hist_boosting...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 0.8000
Start Time:		2025-02-07-2039
End Time:		2025-02-07-2039
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 5 second(s)
============================================================================================================
============================================================================================================



training model: gaussian_process...

Done!

cv_precision: 0.7222	cv_recall: 0.7222	cv_f1_score: 0.7143	accuracy: 0.8000
Start Time:		2025-02-07-2039
End Time:		2025-02-07-2039
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 1 second(s)
============================================================================================================
============================================================================================================



training model: nbc_gaussian...

Done!

cv_precision: 0.7222	cv_recall: 0.7222	cv_f1_score: 0.7143	accuracy: 0.8667
Start Time:		2025-02-07-2039
End Time:		2025-02-07-2039
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: nbc_bernoulli...

Done!

cv_precision: 0.8667	cv_recall: 0.8333	cv_f1_score: 0.8056	accuracy: 0.6667
Start Time:		2025-02-07-2039
End Time:		2025-02-07-2039
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: knn...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 0.8000
Start Time:		2025-02-07-2039
End Time:		2025-02-07-2039
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: quadratic_discriminant...

Done!

cv_precision: 0.9167	cv_recall: 0.9167	cv_f1_score: 0.9048	accuracy: 0.9333
Start Time:		2025-02-07-2039
End Time:		2025-02-07-2039
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: linear_discriminant...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 0.8667
Start Time:		2025-02-07-2039
End Time:		2025-02-07-2039
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 0 second(s)
============================================================================================================
============================================================================================================



training model: mlp...

Done!

cv_precision: 0.8056	cv_recall: 0.8056	cv_f1_score: 0.8056	accuracy: 0.8667
Start Time:		2025-02-07-2039
End Time:		2025-02-07-2039
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 12 second(s)
============================================================================================================
============================================================================================================


******************************************************************************************************************
******************************************************************************************************************
******************************************************************************************************************

LEARNING COMPLETE!!!
========================================
Trained model count:	22
Start Time:		2025-02-07-2038
End Time:		2025-02-07-2039
Analysis Duration:	0 day(s), 0 hour(s), 0 minute(s), 59 second(s)

view performance of each model¶

In [109]:
# sort df by f1_score
df_evaluation_score = df_evaluation_score.sort_values(by='f1_score', ascending=False)
df_evaluation_score
Out[109]:
precision recall f1_score accuracy
quadratic_discriminant 0.916667 0.916667 0.904762 0.916667
svc_nu 0.933333 0.888889 0.896296 0.916667
perceptron 0.861111 0.833333 0.810967 0.833333
logistic_regression 0.805556 0.805556 0.805556 0.833333
linear_discriminant 0.805556 0.805556 0.805556 0.833333
knn 0.805556 0.805556 0.805556 0.833333
nbc_bernoulli 0.866667 0.833333 0.805556 0.833333
hist_boosting 0.805556 0.805556 0.805556 0.833333
ada_boosting 0.805556 0.805556 0.805556 0.833333
gradient_boosting 0.805556 0.805556 0.805556 0.833333
bagging_classifier 0.805556 0.805556 0.805556 0.833333
extra_tree 0.805556 0.805556 0.805556 0.833333
random_forest 0.805556 0.805556 0.805556 0.833333
decision_tree 0.805556 0.805556 0.805556 0.833333
svc 0.805556 0.805556 0.805556 0.833333
linear_svc 0.805556 0.805556 0.805556 0.833333
sgd_classifier 0.805556 0.805556 0.805556 0.833333
mlp 0.805556 0.805556 0.805556 0.833333
gaussian_process 0.722222 0.722222 0.714286 0.750000
nbc_gaussian 0.722222 0.722222 0.714286 0.750000
passive_aggressive 0.523810 0.666667 0.575758 0.750000
ridge_classifier 0.466667 0.500000 0.481481 0.583333

Select model¶

In [110]:
top_model_eval_instance = df_evaluation_score.iloc[0]
#top_model_eval_instance = df_evaluation_score.iloc[1] #override
top_model_name = top_model_eval_instance.name
print(f'\nTOP MODEL: {top_model_name}\n {top_model_eval_instance}  ')

# get trained model
mdl_trained_top_model = dict_models[top_model_name]
confusion_mtx_top_model = dict_trained_model_data[top_model_name][8]
TOP MODEL: quadratic_discriminant
 precision    0.916667
recall       0.916667
f1_score     0.904762
accuracy     0.916667
Name: quadratic_discriminant, dtype: float64  
In [111]:
confusion_mtx_top_model
Out[111]:
array([[4, 0, 0],
       [0, 4, 0],
       [0, 1, 6]])

Create Confusion Matrix to view Model Classification¶

In [112]:
# create confusion matrix
display_confusion_matrix(top_model_name, confusion_mtx_top_model, lst_unique_class_names)
No description has been provided for this image

Determine Feature Importance for Selected Model¶

In [113]:
ftr_importance = None
mdlName = top_model_name
model = dict_models[mdlName]

display_feature_importance_chart(model, mdlName, list(X_train.columns), 10, 'feature_importance_' + str(mdlName))
coefficients:
====================
petal_length	0.01333333333333333
petal_width	0.0
sepal_width	0.0
sepal_length	0.0
No description has been provided for this image

Finally, Deploy the selected model¶

In [114]:
i = 0
instance = X_train.iloc[[i]]
prediction =  mdl_trained_top_model.predict(instance)
ground_truth = y_test.iloc[i]
prediction_correct = (prediction == ground_truth)[0]

print('instance index:', i, ' prediction:', prediction, ' ground truth:', ground_truth, ' prediction correct:', prediction_correct)
instance index: 0  prediction: ['Iris-virginica']  ground truth: Iris-setosa  prediction correct: False
In [115]:
# remember to scale the data
i = 0
instance = X_train_scaled[[i]]
prediction =  mdl_trained_top_model.predict(instance)
ground_truth = y_test.iloc[i]
prediction_correct = (prediction == ground_truth)[0]

print('instance index:', i, ' prediction:', prediction, ' ground truth:', ground_truth, ' prediction correct:', prediction_correct)
instance index: 0  prediction: ['Iris-setosa']  ground truth: Iris-setosa  prediction correct: True

defining a new instance¶

In [116]:
instance = np.array([[7.7, 3.1, 5.1, 1.8]])
instance
Out[116]:
array([[7.7, 3.1, 5.1, 1.8]])

scaling the instance¶

In [117]:
scaled_instance = scaler.transform(instance)
scaled_instance
Out[117]:
array([[1.81813736, 0.30716727, 0.58392945, 0.55594151]])

predicting the instance classificaiton¶

In [118]:
# if the data is not standardized
mdl_trained_top_model.predict(instance)[0]

# if the data is standardized
mdl_trained_top_model.predict(scaled_instance)[0]
Out[118]:
'Iris-versicolor'